library(ggsci)
library(tidyverse)
library(dplyr)
library(forcats)
library(reshape2)
library(stringr)
library(tidyr)
library(tibble)
library(sangerseqR)
library(DECIPHER)
library(Biostrings)
library(phangorn)
library(ape)
library(ggplot2)
library(ggtree)
library(patchwork)
library(bioseq)
library(kmer)
library(GUniFrac)
library(seqinr)
library(vegan)
library(corrplot)
library(ggrepel)
library(ggmsa)
library(dendextend)
library(usedist)
#here is the raw SymPortal Data output that contains data for ITS2 type profiles and DIVs associated with all three coral species
seqs <- read_tsv("20210612_marzonie/186_20211115_03_DBV_20211116T024440.seqs.absolute.abund_and_meta.txt") %>%
mutate(plate = case_when(str_detect(sample_name, "Plate1") ~ "Plate1",
str_detect(sample_name, "Plate2") ~ "Plate2",
str_detect(sample_name, "Plate3") ~ "Plate3",
str_detect(sample_name, "Plate4") ~ "Plate4",
str_detect(sample_name, "Plate5") ~ "Plate5",
str_detect(sample_name, "Plate6") ~ "Plate6",
str_detect(sample_name, "Plate7") ~ "Plate7",
TRUE ~ "Other"),
position = str_sub(sample_name, start = 8, end = 11),
plate_position = paste0(plate, "_", position)) %>%
filter(!(is.na(sample_name))) %>%
select(sample_name = plate_position, `A1`:`1275234_G`) %>%
mutate(sample_name = as.factor(sample_name))
meta = read.csv("Metadata.csv") %>%
mutate(sample_name = as.factor(sample_name))
read_fasta_df <- function (file = "") {
fasta <- readLines(file)
ind <- grep(">", fasta)
s <- data.frame(ind = ind, from = ind + 1, to = c((ind -
1)[-1], length(fasta)))
seqs <- rep(NA, length(ind))
for (i in 1:length(ind)) {
seqs[i] <- paste(fasta[s$from[i]:s$to[i]], collapse = "")
}
tib <- tibble(label = gsub(">", "", fasta[ind]), sequence = seqs)
return(tib)
}
write_fasta_df <- function (data, filename)
{
fastaLines = c()
for (rowNum in 1:nrow(data)) {
fastaLines = c(fastaLines, as.character(paste(">",
data[rowNum, "label"], sep = "")))
fastaLines = c(fastaLines, as.character(data[rowNum,
"sequence"]))
}
fileConn <- file(filename)
writeLines(fastaLines, fileConn)
close(fileConn)
}
dna_to_DNAbin <- function (dna){
DNAbin <- as_DNAbin(dna)
names(DNAbin) <- names(dna)
return(DNAbin)
}
dna_to_DNAStringset <- function(x)
{
bioseq:::check_dna(x)
DNAstr <- DNAStringSet(paste(x))
names(DNAstr) <- names(x)
return(DNAstr)
}
DNAStringSet_to_dna <- function(x){
x_dna <- as_dna(paste(x))
names(x_dna) <- names(x)
res <- tibble(label = names(x), sequence = x_dna)
return(res)
}
# Convert DNAstringset to DNAbin
DNAStringSet_to_DNAbin <- function(DNAStringSet){
DNAbin <- as.DNAbin(DNAStringSet)
return(DNAbin)
}
# https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-018-2315-y
palette <- c("A" = "#46ff2d",
"G" = "#ffae01",
"C" = "#f24641",
"T" = "#4294fa",
"K" = "#8b4816",
"M" = "#83831f",
"R" = "#ffff81",
"S" = "#ff9d80",
"Y" = "#e381f2",
"W" = "#80fff2",
"V" = "#fde4b8",
"B" = "#f9c1bf",
"H" = "#c0d9f9",
"D" = "#c7ffba",
"U" = "#8989fb",
"N" = "black",
"-" = "white",
"+" = "White")
pal_df <- data.frame(names = names(palette), col = palette)
# Convert to long format
seqs_long <- seqs %>%
filter(!is.na(sample_name)) %>%
select(sample_name, `A1`:`1275234_G`) %>%
pivot_longer(`A1`:`1275234_G`) %>%
filter(value > 0) %>% # Remove zero values
left_join(., meta)
# Q. Are we working with the post-med seqs according to the metadata in seqs?
san_check <- seqs_long %>%
group_by(sample_name) %>%
summarise(total = sum(value)) #A. yes
# Create a list of samples to keep that didnt fail to sequence
keepers_ss <- san_check %>%
filter(total > 1500)
#we filter out 3 samples with less than 1500 reads: Plate4_B007, Plate5_G011, Plate7_D009
# Filter out the failed samples
seqs_long <- seqs_long %>%
filter(sample_name %in% keepers_ss$sample_name) %>%
group_by(sample_name) %>%
mutate(value_rel = value/sum(value)) %>% # Convert to relative abundance
ungroup() %>%
mutate(name = as.factor(name)) # Make sample names a factor
# Create a random palette for each sequence
n <- length(levels(seqs_long$name))
seqs_pal = rainbow(n, s=.6, v=.9)[sample(1:n,n, replace = FALSE)]
names(seqs_pal) <- levels(seqs_long$name)
# Read in the profile data
profiles_raw <- read_tsv("20210612_marzonie/186_20211115_03_DBV_20211116T024440.profiles.absolute.abund_and_meta.txt", skip = 6) %>%
select(sample_name = `...2`, `A1/A1h`:`C42a/C1-C42.2`) %>%
mutate(plate = case_when(str_detect(sample_name, "Plate1") ~ "Plate1",
str_detect(sample_name, "Plate2") ~ "Plate2",
str_detect(sample_name, "Plate3") ~ "Plate3",
str_detect(sample_name, "Plate4") ~ "Plate4",
str_detect(sample_name, "Plate5") ~ "Plate5",
str_detect(sample_name, "Plate6") ~ "Plate6",
str_detect(sample_name, "Plate7") ~ "Plate7",
TRUE ~ "Other"),
position = str_sub(sample_name, start = 8, end = 11),
plate_position = paste0(plate, "_", position)) %>%
filter(!is.na(sample_name)) %>%
select(sample_name = plate_position, -plate, -position, `A1/A1h`:`C42a/C1-C42.2`)
#Convert to long format
profiles_long <- profiles_raw %>%
pivot_longer(`A1/A1h`:`C42a/C1-C42.2`) %>% # Convert it to long format
mutate(name = paste0("p_", name)) %>% # Add a p_ to the beginning of each profile (Some profiles are single sequence profiles and clash with the Sequence names)
filter(sample_name %in% seqs_long$sample_name) %>% # Remove samples that dont appear in the Sequence dataframe
group_by(sample_name) %>%
mutate(value = as.numeric(value)) %>%
filter(value > 0) %>% # Remove 0 abundance profiles
mutate(sample_name = as.factor(sample_name),
name = as.factor(name)) %>%
ungroup() %>%
left_join(., meta) # Add in metadata
# What is the total number of profile-related sequences in each sample?
profiles_sum <- profiles_long %>%
group_by(sample_name) %>%
summarise(total = sum(value))
# How many sequences in each sample are not part of a profile?
residual <- left_join(profiles_sum, san_check, by = "sample_name") %>%
mutate(residual = total.y - total.x) %>%
select(sample_name, value = residual) %>%
mutate(name = "non-profile sequences") %>%
left_join(., meta)
# Combine the profiles and non-profile sequences
profile_data <- rbind(profiles_long, residual) %>%
group_by(sample_name) %>%
mutate(value_rel = value/sum(value)) # convert to relative abundance - in that sample
# Create palette for profiles (this is a darker palette)
n <- length(levels(profile_data$name))
profile_pal = rainbow(n, s=.6, v=.6)[sample(1:n,n, replace = FALSE)]
names(profile_pal) <- levels(profile_data$name)
# Merge the palettes and replace the non-profile sequences with grey
all_pal <- c(seqs_pal, profile_pal)
all_pal['non-profile sequences'] <- "#808080"
# Join profiles and sequence data together into single dataframe and add more metadata
all_data <- rbind(seqs_long, profile_data) %>%
mutate(coral_genus = case_when(str_detect(mtORF, "mean|verru|aplo|nkno") ~ "Pocillopora",
str_detect(mtORF, "humil") ~ "Acropora",
TRUE ~ "check"))
all_data %>% filter(coral_genus != "Acropora") %>% distinct(sample_name)
## # A tibble: 346 × 1
## sample_name
## <fct>
## 1 Plate2_G008
## 2 Plate3_D008
## 3 Plate3_G007
## 4 Plate2_F001
## 5 Plate3_H008
## 6 Plate4_C005
## 7 Plate2_B004
## 8 Plate3_E009
## 9 Plate2_G006
## 10 Plate4_E004
## # … with 336 more rows
##Basic Stats
# How many samples per species?
all_data %>%
distinct(sample_name, mtORF) %>%
group_by(mtORF) %>%
summarise(total_samples = n())
## # A tibble: 5 × 2
## mtORF total_samples
## <chr> <int>
## 1 Ahumilis 260
## 2 Haplotype8a 39
## 3 Pmeandrina 133
## 4 Pverrucosa 152
## 5 Unknown 22
# Ahumilis 260
# Haplotype8a 39
# Pmeandrina 133
# Pverrucosa 152
# Unknown 22
humil_total <- 260
hap8a_total <- 39
veru_total <- 152
mean_total <- 133
#remove unknowns from downstream analysis
clean_data <- all_data %>%
filter(mtORF != "Unknown")
# Total number of sequences (for whole library)?
study_total <- clean_data %>%
filter(!(str_detect(name, "p_")),
name != "non-profile sequences") %>%
summarise(total_seqs = sum(value)) %>%
pull(total_seqs)
#15,336,586 total
#Total number of sequences (per species)?
clean_data %>%
filter(!(str_detect(name, "p_")),
name != "non-profile sequences") %>%
group_by(mtORF) %>%
summarise(total_seqs = sum(value))
## # A tibble: 4 × 2
## mtORF total_seqs
## <chr> <dbl>
## 1 Ahumilis 6627245
## 2 Haplotype8a 1025443
## 3 Pmeandrina 3512508
## 4 Pverrucosa 4171390
# Ahumilis 6627245
# Haplotype8a 1025443
# Pmeandrina 3512508
# Pverrucosa 4171390
# Average per sample sequencing depth? (per coral host genus, and per coral species)
# Average per sample across all spp
clean_data %>%
group_by(sample_name) %>%
dplyr::summarise(per_sample = sum(value)) %>%
dplyr::summarise(mean_all = mean(per_sample))
## # A tibble: 1 × 1
## mean_all
## <dbl>
## 1 52523.
# Across all: 52522.55
#average per sample per coral host genus
clean_data %>%
group_by(coral_genus, sample_name) %>%
summarise(per_sample = sum(value)) %>%
summarise(mean_all = mean(per_sample), n = n())
## # A tibble: 2 × 3
## coral_genus mean_all n
## <chr> <dbl> <int>
## 1 Acropora 50979. 260
## 2 Pocillopora 53761. 324
# Acropora 50978.81 260
# Pocillopora 53761.36 324
#average per sample per coral host species
clean_data %>%
group_by(mtORF, sample_name) %>%
summarise(per_sample = sum(value)) %>%
summarise(mean_all = mean(per_sample), n = n())
## # A tibble: 4 × 3
## mtORF mean_all n
## <chr> <dbl> <int>
## 1 Ahumilis 50979. 260
## 2 Haplotype8a 52587. 39
## 3 Pmeandrina 52820. 133
## 4 Pverrucosa 54887. 152
# Species
# Ahumilis 50978.81 260
# Haplotype8a 52586.82 39
# Pmeandrina 52819.67 133
# Pverrucosa 54886.71 152
##Number of sequences per symbiont genus
# Total number of Cladocopium
clean_data %>%
filter(!(str_detect(name, "p_")), name != "non-profile sequences") %>%
filter(str_sub(name, 1, 1) == "C" | str_detect(name, "_C")) %>%
summarise(sum = sum(value))
## # A tibble: 1 × 1
## sum
## <dbl>
## 1 15335016
# 15335016
(15335016 / study_total) * 100 # 99.98976 Clado
## [1] 99.98976
#Total number of Symbiodinium
clean_data %>%
filter(!(str_detect(name, "p_")), name != "non-profile sequences") %>%
filter(str_sub(name, 1, 1) == "A" | str_detect(name, "_A")) %>%
summarise(sum = sum(value))
## # A tibble: 1 × 1
## sum
## <dbl>
## 1 1552
# 1552 total seqs
(1552 / study_total) * 100 # 0.01011959 Symbio
## [1] 0.01011959
# Total number of Durusdinium sequences
clean_data %>%
filter(!(str_detect(name, "p_")), name != "non-profile sequences") %>%
filter(str_sub(name, 1, 1) == "D" | str_detect(name, "_D")) %>%
summarise(sum = sum(value))
## # A tibble: 1 × 1
## sum
## <dbl>
## 1 13
# 13 total reads
(13 / study_total) * 100 # 8.476463e-05 Durusdinium
## [1] 8.476463e-05
#3.2 Characteristics of symbiont DIVs and Type Profiles
# general statements that apply to both species
# what proportion of each sample are composed of non-profile sequences (grouped by coral host species)
clean_data %>%
filter(str_detect(name, "non")) %>%
group_by(mtORF) %>%
summarise(mean = mean(value_rel))
## # A tibble: 4 × 2
## mtORF mean
## <chr> <dbl>
## 1 Ahumilis 0.245
## 2 Haplotype8a 0.176
## 3 Pmeandrina 0.179
## 4 Pverrucosa 0.139
# Ahumilis 0.2454587
# Haplotype8a 0.1763951
# Pmeandrina 0.1792804
# Pverrucosa 0.1393484
# TO DO FURTHER DOWN - humilis split, one side of the unifrac tree has a greater prop of non-profile seqs than the other
clean_data %>%
filter(str_detect(name, "non")) %>%
group_by(coral_genus) %>%
summarise(mean = mean(value_rel))
## # A tibble: 2 × 2
## coral_genus mean
## <chr> <dbl>
## 1 Acropora 0.245
## 2 Pocillopora 0.160
# Acropora 0.2454587
# Pocillopora 0.1601996
# Acropora has a lot more non-profile sequences than any Pocillopora.
library(tidyverse)
#total type profiles across Pocilloporidae
clean_data %>%
filter(coral_genus == "Pocillopora") %>%
filter(str_detect(name, "p_")) %>% #profiles start with p_
group_by(name) %>%
dplyr:: count() %>%
dplyr:: arrange(desc(n)) %>%
ungroup() %>%
mutate(prop = n/sum(n)) %>%
mutate(cumulative_sum = cumsum(prop))
## # A tibble: 26 × 4
## name n prop cumulative_sum
## <fct> <int> <dbl> <dbl>
## 1 p_C1/C42.2/C42g/C42a-C1b-C1au-C1az-C42h-C3 97 0.296 0.296
## 2 p_C1d/C1/C42.2/C3-C1b-C3cg-C115k-C45c-C1au-C41p 71 0.216 0.512
## 3 p_C1/C42.2/C42u-C1b-C42a-C1au-C115l-C1az-C115d 43 0.131 0.643
## 4 p_C1d/C1/C42.2-C3cg-C1b-C45c-C115k-C1au 36 0.110 0.753
## 5 p_C1d/C1/C1ba-C42.2-C3cg-C1b-C115k 13 0.0396 0.793
## 6 p_C42u/C42a/C1/C42.2/C42b-C1b-C1au 11 0.0335 0.826
## 7 p_C42.2/C1dh/C1/C1d-C1b-C3cg 9 0.0274 0.854
## 8 p_C42.2/C1/C42a/C1b-C1au-C1az-C3-C115d 6 0.0183 0.872
## 9 p_C1ag/C1/C1ah-C42.2-C3cg-C1b 6 0.0183 0.890
## 10 p_C42g/C42a/C42.2/C1-C42h-C42b-C1b-C1au 5 0.0152 0.905
## # … with 16 more rows
# profile n_samps prop_samps cumulative_sum
# p_C1/C42.2/C42g/C42a-C1b-C1au-C1az-C42h-C3 97 0.295731707 0.2957317
# p_C1d/C1/C42.2/C3-C1b-C3cg-C115k-C45c-C1au-C41p 71 0.216463415 0.5121951
# p_C1/C42.2/C42u-C1b-C42a-C1au-C115l-C1az-C115d 43 0.131097561 0.6432927
# p_C1d/C1/C42.2-C3cg-C1b-C45c-C115k-C1au 36 0.109756098 0.7530488
# p_C1d/C1/C1ba-C42.2-C3cg-C1b-C115k 13 0.039634146 0.7926829
# p_C42u/C42a/C1/C42.2/C42b-C1b-C1au 11 0.033536585 0.8262195
# p_C42.2/C1dh/C1/C1d-C1b-C3cg 9 0.027439024 0.8536585
# p_C42.2/C1/C42a/C1b-C1au-C1az-C3-C115d 6 0.018292683 0.8719512
# p_C1ag/C1/C1ah-C42.2-C3cg-C1b 6 0.018292683 0.8902439
# p_C42g/C42a/C42.2/C1-C42h-C42b-C1b-C1au 5 0.015243902 0.9054878
# p_C1d-C42.2-C1-C1k-C1b-C3cg 5 0.015243902 0.9207317
# p_C42.2/C1/C42a-C1b-C42g-C42b-C1au-C1az 5 0.015243902 0.9359756
# p_C1ag/C1/C42.2-C3cg-C1b-C1bi 4 0.012195122 0.9481707
# p_C1ag-C1-C42.2-C1bi-C3cg-C1b-C1bk 2 0.006097561 0.9542683
# p_C1d/C15h-C1-C42.2 2 0.006097561 0.9603659
# p_C42u-C1-C42.2-C42a-C1b-C1au-C115k-C115d 2 0.006097561 0.9664634
# p_C1ag/C1/C42.2/C1bi-C3cg-C1b-C3cw-C45c 2 0.006097561 0.9725610
# p_C3-C3k 1 0.003048780 0.9756098
# p_C42g-C42.2-C1-C1b-C42h-C42a-C42ba-C42b 1 0.003048780 0.9786585
# p_C42a/C1/C42.2/C1b/C1j-C1au-C3-C115l 1 0.003048780 0.9817073
# p_A1/A1h 1 0.003048780 0.9847561
# p_C15h 1 0.003048780 0.9878049
# p_C1ag/C1-C42.2-C3cg-C1b 1 0.003048780 0.9908537
# p_C1d 1 0.003048780 0.9939024
# p_C1ag/C1-C1m-C42.2-C1lr-C1bi-C3cg 1 0.003048780 0.9969512
# p_C1/C42.2/C42g/C42a-C1b-C1az-C3-C1au 1 0.003048780 1.0000000
#total number of profiles, in order of highest abundance across P. verrucosa samples
clean_data %>%
filter(mtORF == "Pverrucosa") %>%
filter(str_detect(name, "p_")) %>% #profiles start with p_
group_by(name) %>%
dplyr:: count() %>%
dplyr:: arrange(desc(n)) %>%
ungroup() %>%
mutate(prop = n/sum(n)) %>%
mutate(cumulative_sum = cumsum(prop))
## # A tibble: 16 × 4
## name n prop cumulative_sum
## <fct> <int> <dbl> <dbl>
## 1 p_C1d/C1/C42.2/C3-C1b-C3cg-C115k-C45c-C1au-C41p 71 0.461 0.461
## 2 p_C1d/C1/C42.2-C3cg-C1b-C45c-C115k-C1au 36 0.234 0.695
## 3 p_C1d/C1/C1ba-C42.2-C3cg-C1b-C115k 13 0.0844 0.779
## 4 p_C42.2/C1dh/C1/C1d-C1b-C3cg 8 0.0519 0.831
## 5 p_C1ag/C1/C1ah-C42.2-C3cg-C1b 6 0.0390 0.870
## 6 p_C1d-C42.2-C1-C1k-C1b-C3cg 5 0.0325 0.903
## 7 p_C1ag/C1/C42.2-C3cg-C1b-C1bi 4 0.0260 0.929
## 8 p_C1ag-C1-C42.2-C1bi-C3cg-C1b-C1bk 2 0.0130 0.942
## 9 p_C1d/C15h-C1-C42.2 2 0.0130 0.955
## 10 p_C1/C42.2/C42g/C42a-C1b-C1au-C1az-C42h-C3 1 0.00649 0.961
## 11 p_C1/C42.2/C42u-C1b-C42a-C1au-C115l-C1az-C115d 1 0.00649 0.968
## 12 p_C3-C3k 1 0.00649 0.974
## 13 p_C1ag/C1-C42.2-C3cg-C1b 1 0.00649 0.981
## 14 p_C1d 1 0.00649 0.987
## 15 p_C1ag/C1/C42.2/C1bi-C3cg-C1b-C3cw-C45c 1 0.00649 0.994
## 16 p_C1ag/C1-C1m-C42.2-C1lr-C1bi-C3cg 1 0.00649 1
verru_pros <- 22
# p_C1d/C1/C42.2/C3-C1b-C3cg-C115k-C45c-C1au-C41p 71 0.461038961 0.4610390
# p_C1d/C1/C42.2-C3cg-C1b-C45c-C115k-C1au 36 0.233766234 0.6948052
# p_C1d/C1/C1ba-C42.2-C3cg-C1b-C115k 13 0.084415584 0.7792208
# p_C42.2/C1dh/C1/C1d-C1b-C3cg 8 0.051948052 0.8311688
# p_C1ag/C1/C1ah-C42.2-C3cg-C1b 6 0.038961039 0.8701299
# p_C1d-C42.2-C1-C1k-C1b-C3cg 5 0.032467532 0.9025974
# p_C1ag/C1/C42.2-C3cg-C1b-C1bi 4 0.025974026 0.9285714
# p_C1ag-C1-C42.2-C1bi-C3cg-C1b-C1bk 2 0.012987013 0.9415584
# p_C1d/C15h-C1-C42.2 2 0.012987013 0.9545455
# p_C1/C42.2/C42g/C42a-C1b-C1au-C1az-C42h-C3 1 0.006493506 0.9610390
# p_C1/C42.2/C42u-C1b-C42a-C1au-C115l-C1az-C115d 1 0.006493506 0.9675325
# p_C3-C3k 1 0.006493506 0.9740260
# p_C1ag/C1-C42.2-C3cg-C1b 1 0.006493506 0.9805195
# p_C1d 1 0.006493506 0.9870130
# p_C1ag/C1/C42.2/C1bi-C3cg-C1b-C3cw-C45c 1 0.006493506 0.9935065
# p_C1ag/C1-C1m-C42.2-C1lr-C1bi-C3cg 1 0.006493506 1.0000000
clean_data %>%
filter(mtORF == "Pmeandrina") %>%
filter(str_detect(name, "p_")) %>%
group_by(name) %>%
dplyr:: count() %>%
dplyr:: arrange(desc(n)) %>%
ungroup() %>%
mutate(prop = n/sum(n)) %>%
mutate(cumulative_sum = cumsum(prop))
## # A tibble: 12 × 4
## name n prop cumulative_sum
## <fct> <int> <dbl> <dbl>
## 1 p_C1/C42.2/C42g/C42a-C1b-C1au-C1az-C42h-C3 74 0.548 0.548
## 2 p_C1/C42.2/C42u-C1b-C42a-C1au-C115l-C1az-C115d 33 0.244 0.793
## 3 p_C42u/C42a/C1/C42.2/C42b-C1b-C1au 9 0.0667 0.859
## 4 p_C42.2/C1/C42a/C1b-C1au-C1az-C3-C115d 5 0.0370 0.896
## 5 p_C42.2/C1/C42a-C1b-C42g-C42b-C1au-C1az 5 0.0370 0.933
## 6 p_C42g/C42a/C42.2/C1-C42h-C42b-C1b-C1au 3 0.0222 0.956
## 7 p_C42g-C42.2-C1-C1b-C42h-C42a-C42ba-C42b 1 0.00741 0.963
## 8 p_C42a/C1/C42.2/C1b/C1j-C1au-C3-C115l 1 0.00741 0.970
## 9 p_A1/A1h 1 0.00741 0.978
## 10 p_C15h 1 0.00741 0.985
## 11 p_C42.2/C1dh/C1/C1d-C1b-C3cg 1 0.00741 0.993
## 12 p_C1ag/C1/C42.2/C1bi-C3cg-C1b-C3cw-C45c 1 0.00741 1
mean_pros <- 12
# p_C1/C42.2/C42g/C42a-C1b-C1au-C1az-C42h-C3 74 0.548148148 0.5481481
# p_C1/C42.2/C42u-C1b-C42a-C1au-C115l-C1az-C115d 33 0.244444444 0.7925926
# p_C42u/C42a/C1/C42.2/C42b-C1b-C1au 9 0.066666667 0.8592593
# p_C42.2/C1/C42a/C1b-C1au-C1az-C3-C115d 5 0.037037037 0.8962963
# p_C42.2/C1/C42a-C1b-C42g-C42b-C1au-C1az 5 0.037037037 0.9333333
# p_C42g/C42a/C42.2/C1-C42h-C42b-C1b-C1au 3 0.022222222 0.9555556
# p_C42g-C42.2-C1-C1b-C42h-C42a-C42ba-C42b 1 0.007407407 0.9629630
# p_C42a/C1/C42.2/C1b/C1j-C1au-C3-C115l 1 0.007407407 0.9703704
# p_A1/A1h 1 0.007407407 0.9777778
# p_C15h 1 0.007407407 0.9851852
# p_C42.2/C1dh/C1/C1d-C1b-C3cg 1 0.007407407 0.9925926
# p_C1ag/C1/C42.2/C1bi-C3cg-C1b-C3cw-C45c 1 0.007407407 1.0000000
clean_data %>%
filter(mtORF == "Haplotype8a") %>%
filter(str_detect(name, "p_")) %>% #profiles start with p_
group_by(name) %>%
dplyr:: count() %>%
dplyr:: arrange(desc(n)) %>%
ungroup() %>%
mutate(prop = n/sum(n)) %>%
mutate(cumulative_sum = cumsum(prop))
## # A tibble: 7 × 4
## name n prop cumulative_sum
## <fct> <int> <dbl> <dbl>
## 1 p_C1/C42.2/C42g/C42a-C1b-C1au-C1az-C42h-C3 22 0.564 0.564
## 2 p_C1/C42.2/C42u-C1b-C42a-C1au-C115l-C1az-C115d 9 0.231 0.795
## 3 p_C42u/C42a/C1/C42.2/C42b-C1b-C1au 2 0.0513 0.846
## 4 p_C42g/C42a/C42.2/C1-C42h-C42b-C1b-C1au 2 0.0513 0.897
## 5 p_C42u-C1-C42.2-C42a-C1b-C1au-C115k-C115d 2 0.0513 0.949
## 6 p_C42.2/C1/C42a/C1b-C1au-C1az-C3-C115d 1 0.0256 0.974
## 7 p_C1/C42.2/C42g/C42a-C1b-C1az-C3-C1au 1 0.0256 1
hap8a_pros <- 7
# p_C1/C42.2/C42g/C42a-C1b-C1au-C1az-C42h-C3 22 0.56410256 0.5641026
# p_C1/C42.2/C42u-C1b-C42a-C1au-C115l-C1az-C115d 9 0.23076923 0.7948718
# p_C42u/C42a/C1/C42.2/C42b-C1b-C1au 2 0.05128205 0.8461538
# p_C42g/C42a/C42.2/C1-C42h-C42b-C1b-C1au 2 0.05128205 0.8974359
# p_C42u-C1-C42.2-C42a-C1b-C1au-C115k-C115d 2 0.05128205 0.9487179
# p_C42.2/C1/C42a/C1b-C1au-C1az-C3-C115d 1 0.02564103 0.9743590
# p_C1/C42.2/C42g/C42a-C1b-C1az-C3-C1au 1 0.02564103 1.0000000
clean_data %>%
filter(mtORF == "Ahumilis") %>%
filter(str_detect(name, "p_")) %>% #profiles start with p_
group_by(name) %>%
dplyr:: count() %>%
dplyr:: arrange(desc(n)) %>%
ungroup() %>%
mutate(prop = n/sum(n)) %>%
mutate(cumulative_sum = cumsum(prop))
## # A tibble: 23 × 4
## name n prop cumulative_sum
## <fct> <int> <dbl> <dbl>
## 1 p_C3k/C3-C50a-C29-C21ab-C3b 160 0.593 0.593
## 2 p_C3k/C3-C50a-C21ab-C50f-C3ba-C3dq 52 0.193 0.785
## 3 p_C3k/C3-C50a-C3ba-C50f-C3dq-C21-C3a 14 0.0519 0.837
## 4 p_C3k/C3-C50a-C3jv-C3vx-C3vy 12 0.0444 0.881
## 5 p_C3bo/C3k-C3-C3bp-C50a-C29 6 0.0222 0.904
## 6 p_C3k/C3/C1-C50a-C21ab 4 0.0148 0.919
## 7 p_C3k-C3-C50a-C21ab-C3b 3 0.0111 0.930
## 8 p_C3k-C3-C50a-C21ab-C50f-C3ba 2 0.00741 0.937
## 9 p_C1/C42.2 2 0.00741 0.944
## 10 p_C1/C1c 2 0.00741 0.952
## # … with 13 more rows
humil_pros <- 23
# p_C3k/C3-C50a-C29-C21ab-C3b 160 0.592592593 0.5925926
# p_C3k/C3-C50a-C21ab-C50f-C3ba-C3dq 52 0.192592593 0.7851852
# p_C3k/C3-C50a-C3ba-C50f-C3dq-C21-C3a 14 0.051851852 0.8370370
# p_C3k/C3-C50a-C3jv-C3vx-C3vy 12 0.044444444 0.8814815
# p_C3bo/C3k-C3-C3bp-C50a-C29 6 0.022222222 0.9037037
# p_C3k/C3/C1-C50a-C21ab 4 0.014814815 0.9185185
# p_C3k-C3-C50a-C21ab-C3b 3 0.011111111 0.9296296
# p_C3k-C3-C50a-C21ab-C50f-C3ba 2 0.007407407 0.9370370
# p_C1/C42.2 2 0.007407407 0.9444444
# p_C1/C1c 2 0.007407407 0.9518519
# p_C1/C42.2/C42g/C42a-C1b-C1au-C1az-C42h-C3 1 0.003703704 0.9555556
# p_A1 1 0.003703704 0.9592593
# p_C3/C3k-C29-C21ab-C3b-C3gj-C21.12 1 0.003703704 0.9629630
# p_C3-C21-C3k-C3at-C3b-C3av-C3dp 1 0.003703704 0.9666667
# p_C40/C1-C3-C115 1 0.003703704 0.9703704
# p_C42.2/C42a/C1-C1b 1 0.003703704 0.9740741
# p_C1d/C1 1 0.003703704 0.9777778
# p_C1/C3k-C1b-C3-C42.2-C1bh-C1br 1 0.003703704 0.9814815
# p_C3/C21/C3av-C3at-C3b-C3dp 1 0.003703704 0.9851852
# p_C3k-C50a-C3cz 1 0.003703704 0.9888889
# p_C42a/C1-C42.2 1 0.003703704 0.9925926
# p_C1/C3-C1c-C1b-C1w 1 0.003703704 0.9962963
# p_C3k/C50a 1 0.003703704 1.0000000
#What is the proportion of samples that had 1 and 2 type profiles?
clean_data %>%
filter(coral_genus == "Pocillopora") %>%
filter(str_detect(name, "p_")) %>%
group_by(sample_name) %>%
summarise(n = n()) %>%
filter(n == 1) #320 samples have 1 type profile
## # A tibble: 320 × 2
## sample_name n
## <fct> <int>
## 1 Plate1_A001 1
## 2 Plate1_A002 1
## 3 Plate1_A003 1
## 4 Plate1_A004 1
## 5 Plate1_A005 1
## 6 Plate1_A006 1
## 7 Plate1_A008 1
## 8 Plate1_A009 1
## 9 Plate1_A010 1
## 10 Plate1_A011 1
## # … with 310 more rows
#filter(n == 2) #4 samples have 2 type profiles
clean_data %>%
filter(coral_genus == "Acropora") %>%
filter(str_detect(name, "p_")) %>%
group_by(sample_name) %>%
summarise(n = n()) %>%
#filter(n == 1) #250 samples have 1 type profile
filter(n == 2) #10 samples have 2 type profiles
## # A tibble: 10 × 2
## sample_name n
## <fct> <int>
## 1 Plate6_A001 2
## 2 Plate6_D009 2
## 3 Plate6_F009 2
## 4 Plate6_F011 2
## 5 Plate7_A006 2
## 6 Plate7_A010 2
## 7 Plate7_B010 2
## 8 Plate7_D006 2
## 9 Plate7_E006 2
## 10 Plate7_H009 2
#pverrucosa
clean_data %>%
filter(mtORF == "Pverrucosa") %>%
filter(str_detect(name, c("p_"))) %>%
filter(str_detect(name, c("p_C1d")))
## # A tibble: 128 × 47
## sample_name name value Refer…¹ Vial Species mtORF psba_…² psba_…³ psba_…⁴
## <fct> <fct> <dbl> <int> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Plate4_E003 p_C1d/… 27948 426 P ve… Pverru… Pver… "" "" ""
## 2 Plate1_E006 p_C1d/… 18997 459 P ve… Pverru… Pver… "" "" ""
## 3 Plate4_D005 p_C1d/… 18172 448 P ve… Pverru… Pver… "PverC… "Bad" "Bad"
## 4 Plate2_C011 p_C1d/… 27531 519 P ve… Pverru… Pver… "" "" ""
## 5 Plate4_A003 p_C1d/… 26506 420 P ve… Pverru… Pver… "" "" ""
## 6 Plate1_F011 p_C1d/… 25657 541 P ve… Pverru… Pver… "" "" ""
## 7 Plate1_G008 p_C1d/… 32542 480 P ve… Pverru… Pver… "" "" ""
## 8 Plate2_H010 p_C1d/… 21499 515 P ve… Pverru… Pver… "" "" ""
## 9 Plate3_C004 p_C1d/… 19773 425 P ve… Pverru… Pver… "" "" ""
## 10 Plate1_C011 p_C1d/… 29084 538 P ve… Pverru… Pver… "" "" ""
## # … with 118 more rows, 37 more variables: psba_ID <chr>, mt_forwardQC <chr>,
## # mt_reverseQC <chr>, mtorf_ID <chr>, Reef <chr>, Site <chr>,
## # DateCollect <chr>, catBleaching <int>, Depth <dbl>, DHW <chr>, Lat <dbl>,
## # Long <dbl>, Exposure <chr>, Aspect <chr>, Sector <chr>, maxDHW <dbl>,
## # meanDHW <dbl>, recent.maxDHW <dbl>, recent.meanDHW <dbl>, DHW2 <int>,
## # DHW3 <int>, DHW4 <int>, DHW6 <int>, DHW8 <int>, DHW9 <int>,
## # returnDHW3 <dbl>, returnDHW4 <dbl>, returnDHW6 <dbl>, meanSST <dbl>, …
128 / veru_total
## [1] 0.8421053
#found as majority sequence in 84.2% of P. verrucosa samples
#p.mea/haplotype 8a
clean_data %>%
filter(mtORF == "Pmeandrina" | mtORF == "Haplotype8a") %>%
filter(str_detect(name, c("p_"))) %>%
filter(str_detect(name, c("p_C1|p_C42.2")))
## # A tibble: 153 × 47
## sample_name name value Refer…¹ Vial Species mtORF psba_…² psba_…³ psba_…⁴
## <fct> <fct> <dbl> <int> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Plate2_G008 p_C1/C… 26362 365 P ve… Pmeand… Pmea… "" "" ""
## 2 Plate3_D008 p_C1/C… 20767 591 P ve… Unknown Hapl… "PunkC… "Bad" "Bad"
## 3 Plate3_G007 p_C1/C… 25118 360 P ve… Pmeand… Pmea… "" "" ""
## 4 Plate2_F001 p_C1/C… 23410 277 P ve… Pmeand… Pmea… "" "" ""
## 5 Plate3_H008 p_C1/C… 29095 595 P ve… Unknown Hapl… "" "" ""
## 6 Plate4_C005 p_C1/C… 19389 329 P ve… Pmeand… Pmea… "" "" ""
## 7 Plate2_B004 p_C1/C… 26328 315 P ve… Pmeand… Pmea… "" "" ""
## 8 Plate3_E009 p_C1/C… 29186 597 P ve… Unknown Hapl… "" "" ""
## 9 Plate2_G006 p_C1/C… 22104 585 P ve… Unknown Hapl… "PunkC… "Good" "Bad"
## 10 Plate4_E004 p_C1/C… 24129 327 P ve… Pmeand… Pmea… "" "" ""
## # … with 143 more rows, 37 more variables: psba_ID <chr>, mt_forwardQC <chr>,
## # mt_reverseQC <chr>, mtorf_ID <chr>, Reef <chr>, Site <chr>,
## # DateCollect <chr>, catBleaching <int>, Depth <dbl>, DHW <chr>, Lat <dbl>,
## # Long <dbl>, Exposure <chr>, Aspect <chr>, Sector <chr>, maxDHW <dbl>,
## # meanDHW <dbl>, recent.maxDHW <dbl>, recent.meanDHW <dbl>, DHW2 <int>,
## # DHW3 <int>, DHW4 <int>, DHW6 <int>, DHW8 <int>, DHW9 <int>,
## # returnDHW3 <dbl>, returnDHW4 <dbl>, returnDHW6 <dbl>, meanSST <dbl>, …
153 /(mean_total + hap8a_total) # 0.8895349
## [1] 0.8895349
#acropora
clean_data %>%
filter(mtORF == "Ahumilis") %>%
filter(str_detect(name, c("p_"))) %>%
filter(str_detect(name, c("p_C3k|p_C3")))
## # A tibble: 258 × 47
## sample_name name value Refer…¹ Vial Species mtORF psba_…² psba_…³ psba_…⁴
## <fct> <fct> <dbl> <int> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Plate5_C011 p_C3k/… 20465 208 A te… Ahumil… Ahum… "" "" ""
## 2 Plate5_F007 p_C3k/… 27117 129 A te… Ahumil… Ahum… "" "" ""
## 3 Plate5_D012 p_C3k/… 13819 244 A te… Ahumil… Ahum… "" "" ""
## 4 Plate7_B007 p_C3k/… 22306 205 A te… Ahumil… Ahum… "AcroC… "Good" "Good"
## 5 Plate7_F002 p_C3k/… 23441 88 A te… Ahumil… Ahum… "" "" ""
## 6 Plate6_A010 p_C3k/… 24848 204 A te… Ahumil… Ahum… "" "" ""
## 7 Plate7_G004 p_C3k/… 17492 151 A te… Ahumil… Ahum… "" "" ""
## 8 Plate7_A004 p_C3k/… 24349 136 A te… Ahumil… Ahum… "" "" ""
## 9 Plate7_G009 p_C3k/… 22047 253 A te… Ahumil… Ahum… "" "" ""
## 10 Plate7_F007 p_C3k/… 7075 217 A te… Ahumil… Ahum… "" "" ""
## # … with 248 more rows, 37 more variables: psba_ID <chr>, mt_forwardQC <chr>,
## # mt_reverseQC <chr>, mtorf_ID <chr>, Reef <chr>, Site <chr>,
## # DateCollect <chr>, catBleaching <int>, Depth <dbl>, DHW <chr>, Lat <dbl>,
## # Long <dbl>, Exposure <chr>, Aspect <chr>, Sector <chr>, maxDHW <dbl>,
## # meanDHW <dbl>, recent.maxDHW <dbl>, recent.meanDHW <dbl>, DHW2 <int>,
## # DHW3 <int>, DHW4 <int>, DHW6 <int>, DHW8 <int>, DHW9 <int>,
## # returnDHW3 <dbl>, returnDHW4 <dbl>, returnDHW6 <dbl>, meanSST <dbl>, …
258 / humil_total
## [1] 0.9923077
#C3k/C3 is majority sequence in 0.9923077 of all samples
Table S1.
#How many samples collected from each reef, for each type of coral species?
meta %>% dplyr::select(c(Vial, Reef, mtORF)) %>%
filter(mtORF != "Unknown") %>%
group_by(Reef, mtORF) %>%
summarise(total_samples = n()) %>%
pivot_wider(names_from = mtORF, values_from = total_samples)
## # A tibble: 13 × 5
## # Groups: Reef [13]
## Reef Ahumilis Haplotype8a Pmeandrina Pverrucosa
## <chr> <int> <int> <int> <int>
## 1 Bougainville 22 6 10 18
## 2 Chilcott 18 2 10 3
## 3 Flinders 30 2 5 26
## 4 Frederick 17 2 13 2
## 5 Herald 19 3 5 11
## 6 Holmes 30 2 7 20
## 7 Lihou 32 1 6 18
## 8 Marion 36 6 11 13
## 9 Moore 20 3 19 10
## 10 Osprey 14 NA 4 11
## 11 Saumarez NA 5 22 NA
## 12 Willis NA 1 9 14
## 13 Wreck 24 6 13 6
seq_data <- clean_data %>%
filter(!str_detect(name, "non")) %>%
filter(!str_detect(name, "p_"))
#creating an object for each species
poci_seqs <- seq_data %>% filter(coral_genus == "Pocillopora")
pver_seqs <- seq_data %>% filter(str_detect(mtORF, "Pverrucosa"))
pmh8_seqs <- seq_data %>% filter(str_detect(mtORF, "Pmeandrina|Haplotype8a"))
acro_seqs <- seq_data %>% filter(str_detect(mtORF, "Ahumilis"))
#read in file
fasta_poci <- read_fasta_df("20210612_marzonie/186_20211115_03_DBV_20211116T024440.seqs.fasta") %>%
filter(label %in% poci_seqs$name) %>% #only keeping DNA seqs that appear in seqs_long subset
filter(!str_detect(label, "A|G")) %>%
deframe() %>%
as_dna()
#creating the tree
kdist_poci <- fasta_poci %>%
dna_to_DNAbin() %>%
kdistance(k = 7, residues = "DNA", method = "edgar") %>%
as.matrix()
k_tree_poci <- kdist_poci %>% phangorn::upgma()
seqs_wide_poci <- poci_seqs %>%
dplyr::select(sample_name, name, value) %>%
filter(!str_detect(name, "A|G")) %>%
pivot_wider(names_from = name, values_from = value, values_fill = 0) %>%
tibble::column_to_rownames(var = "sample_name")
k_unidist_poci <- GUniFrac(seqs_wide_poci, k_tree_poci) #GUniFrac calculates all the distances
k_unidist_poci <- k_unidist_poci$unifracs
du_poci <- k_unidist_poci[, , "d_0.5"] # GUniFrac with alpha 0.5
dist_poci <- as.dist(du_poci, diag = FALSE)
# Cluster the samples
hclust_samps_poci <- upgma(du_poci)
# Make the sample tree
tree_poci <- ggtree(hclust_samps_poci, size = 0.2) +
theme(aspect.ratio = 0.3)
# Get a sample order from ggtree
poci_sample_order <- tree_poci$data %>% filter(isTip == "TRUE") %>%
arrange(y) %>%
pull(label)
pocimat <- meta %>% filter(sample_name %in% tree_poci$data$label) %>%
select(sample_name, RFLP = Species, mtORF) %>%
mutate(RFLP = case_when(RFLP == "Pmeandrina" ~ "P. meandrina",
RFLP == "Pverrucosa" ~ "P. verrucosa", TRUE ~ RFLP),
mtORF = case_when(mtORF == "Pmeandrina" ~ "P. meandrina",
mtORF == "Pverrucosa" ~ "P. verrucosa", TRUE ~ mtORF)) %>%
tibble::column_to_rownames(var = "sample_name")
poci_tree_mat <- gheatmap(tree_poci, pocimat, colnames = T, width=.1, offset = -0.26)
poci_tree_mat <- poci_tree_mat + scale_fill_d3(palette = "category10", na.value = "grey90") + layout_dendrogram() + theme(legend.position = "left")
# Start plotting the composition data
plot_df_poci <- clean_data %>%
filter(coral_genus == "Pocillopora") %>%
mutate(sample_name = fct_relevel(sample_name, poci_sample_order))
theme_set(theme_bw())
# find the likely distinguishing seqs in here
test_df <- pmh8_seqs %>%
group_by(name) %>%
summarise(mean = mean(value_rel), n = n()) %>%
arrange(desc(n), desc(mean))
# colour them black to check
test_pal <- all_pal
test_pal['C42g'] <- "#000000"
bar_uni_poci <-
ggplot(plot_df_poci, aes(sample_name, value_rel)) +
geom_bar(stat = "identity", aes(fill = name, colour = name)) +
theme(aspect.ratio = 0.5, legend.position = "none", axis.text.y=element_blank(), axis.ticks.y = element_blank(),
axis.text.x=element_blank(), axis.ticks.x = element_blank(),
axis.title.x = element_blank(), axis.title.y = element_blank(),
panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.ticks = element_blank()) +
scale_fill_manual(values = all_pal, breaks = levels(profile_data$name)) +
scale_colour_manual(values = test_pal, breaks = levels(profile_data$name)) +
geom_hline(yintercept = 1, size = 1) +
guides(fill=guide_legend(ncol=2))
bar_uni_poci
#p_bar_uni is the sequences by colour. P_tree_tip is the tree coloured by reef.
poci_tree_mat / bar_uni_poci
OBSERVATIONS: Regardless of the tree-type and unifrac combination, there is one major branch point in the tree. While this does not seem to be consistent with lat/lon, the unweighted approach seems very good at placing samples from the same lat/lons together.
#read in file
fasta_acro <- read_fasta_df("20210612_marzonie/186_20211115_03_DBV_20211116T024440.seqs.fasta") %>%
filter(label %in% acro_seqs$name) %>% #only keeping DNA seqs that appear in seqs_long subset
filter(str_sub(label, 1, 1) == "C" | str_detect(label, "_C")) %>%
deframe() %>%
as_dna()
# Unifracs With a kmer-based tree
kdist_acro <- fasta_acro %>%
dna_to_DNAbin() %>%
kdistance(k = 7, residues = "DNA", method = "edgar") %>%
as.matrix()
k_tree_acro <- kdist_acro %>% phangorn::upgma()
# Get the community matrix
seqs_wide_acro <- acro_seqs %>%
dplyr::select(sample_name, name, value) %>%
filter(str_sub(name, 1, 1) == "C" | str_detect(name, "_C")) %>%
pivot_wider(names_from = name, values_from = value, values_fill = 0) %>%
tibble::column_to_rownames(var = "sample_name")
# comput unifrac distances
k_unidist_acro <- GUniFrac(seqs_wide_acro, k_tree_acro) #GUniFrac calculates all the distances
k_unidist_acro <- k_unidist_acro$unifracs
du_acro <- k_unidist_acro[, , "d_0.5"] # GUniFrac with alpha 0.5
dist_acro <- as.dist(du_acro, diag = FALSE)
# Cluster the samples
hclust_samps_acro <- upgma(du_acro)
# Make the sample tree
tree_acro <- ggtree(hclust_samps_acro, size = 0.2) +
theme(aspect.ratio = 0.3) + layout_dendrogram()
# Get a sample order from ggtree
acro_sample_order <- tree_acro$data %>% filter(isTip == "TRUE") %>%
arrange(y) %>%
pull(label)
# Start plotting the composition data
plot_df_acro <- clean_data %>%
filter(str_detect(Species, "Ahumilis")) %>%
mutate(sample_name = fct_relevel(sample_name, acro_sample_order))
theme_set(theme_bw())
# find the likely distinguishing seqs in here
test_df <- acro_seqs %>%
group_by(name) %>%
summarise(mean = mean(value_rel), n = n()) %>%
arrange(desc(n), desc(mean))
# colour them black to check
test_pal <- all_pal
test_pal['C1c'] <- "#000000"
bar_uni_acro <-
ggplot(plot_df_acro, aes(sample_name, value_rel)) +
geom_bar(stat = "identity", aes(fill = name, colour = name)) +
theme(aspect.ratio = 0.5, legend.position = "none", axis.text.y=element_blank(), axis.ticks.y = element_blank(),
axis.text.x=element_blank(), axis.ticks.x = element_blank(),
axis.title.x = element_blank(), axis.title.y = element_blank(),
panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.ticks = element_blank()) +
scale_fill_manual(values = test_pal, breaks = levels(profile_data$name)) +
scale_colour_manual(values = test_pal, breaks = levels(profile_data$name)) +
geom_hline(yintercept = 1, size = 1) +
guides(fill=guide_legend(ncol=2))
#p_bar_uni is the sequences by colour. P_tree_tip is the tree coloured by reef.
tree_acro / bar_uni_acro
fasta_pver <- read_fasta_df("20210612_marzonie/186_20211115_03_DBV_20211116T024440.seqs.fasta") %>%
filter(label %in% pver_seqs$name) %>% #only keeping DNA seqs that appear in seqs_long subset
filter(!str_detect(label, "A|G")) %>%
deframe() %>%
as_dna()
#creating the tree
kdist_pver <- fasta_pver %>%
dna_to_DNAbin() %>%
kdistance(k = 7, residues = "DNA", method = "edgar") %>%
as.matrix()
k_tree_pver <- kdist_pver %>% phangorn::upgma()
seqs_wide_pver <- pver_seqs %>%
dplyr::select(sample_name, name, value) %>%
filter(!str_detect(name, "A|G")) %>%
pivot_wider(names_from = name, values_from = value, values_fill = 0) %>%
tibble::column_to_rownames(var = "sample_name")
k_unidist_pver <- GUniFrac(seqs_wide_pver, k_tree_pver) #GUniFrac calculates all the distances
k_unidist_pver <- k_unidist_pver$unifracs
du_pver <- k_unidist_pver[, , "d_0.5"] # GUniFrac with alpha 0.5
dist_pver <- as.dist(du_pver, diag = FALSE)
fasta_pmh8 <- read_fasta_df("20210612_marzonie/186_20211115_03_DBV_20211116T024440.seqs.fasta") %>%
filter(label %in% pmh8_seqs$name) %>% #only keeping DNA seqs that appear in seqs_long subset
filter(!str_detect(label, "A|G")) %>%
deframe() %>%
as_dna()
#creating the tree
kdist_pmh8 <- fasta_pmh8 %>%
dna_to_DNAbin() %>%
kdistance(k = 7, residues = "DNA", method = "edgar") %>%
as.matrix()
k_tree_pmh8 <- kdist_pmh8 %>% phangorn::upgma()
seqs_wide_pmh8 <- pmh8_seqs %>%
dplyr::select(sample_name, name, value) %>%
filter(!str_detect(name, "A|G")) %>%
pivot_wider(names_from = name, values_from = value, values_fill = 0) %>%
tibble::column_to_rownames(var = "sample_name")
k_unidist_pmh8 <- GUniFrac(seqs_wide_pmh8, k_tree_pmh8) # GUniFrac calculates all the distances
k_unidist_pmh8 <- k_unidist_pmh8$unifracs
du_pmh8 <- k_unidist_pmh8[, , "d_0.5"] # GUniFrac with alpha 0.5
dist_pmh8 <- as.dist(du_pmh8, diag = FALSE)
#psba txt files forward and reverse
f_all <- sort(list.files(path = "psba txt files/", pattern = "Forw.txt", full.names = TRUE))
r_all <- sort(list.files(path = "psba txt files/", pattern = "Rev.txt", full.names = TRUE))
f_all_fasta <- f_all %>%
map(read_fasta_df) %>% # read in all the files
purrr::reduce(rbind) # reduce with rbind into one dataframe
r_all_fasta <- r_all %>%
map(read_fasta_df) %>% # read in all the files
purrr::reduce(rbind) # reduce with rbind into one dataframe
# apply the trimming parameters to start and end of the sequences to improve consensus
f_all_fasta_sub <- f_all_fasta %>%
mutate(sequence = case_when(str_detect(label, "psba1_A12") ~ str_sub(sequence, start = 30, end = 550),
str_detect(label, "psba1_B02") ~ str_sub(sequence, start = 30, end = 700),
str_detect(label, "psba1_B07") ~ str_sub(sequence, start = 30, end = 600),
str_detect(label, "psba1_B09") ~ str_sub(sequence, start = 30, end = 650),
str_detect(label, "psba1_B09") ~ str_sub(sequence, start = 30, end = 600),
str_detect(label, "psba1_B10") ~ str_sub(sequence, start = 30, end = 650),
str_detect(label, "psba1_C12") ~ str_sub(sequence, start = 30, end = 500),
str_detect(label, "psba1_D10") ~ str_sub(sequence, start = 30, end = 500),
str_detect(label, "psba1_D11") ~ str_sub(sequence, start = 30, end = 545),
TRUE ~ str_sub(sequence, start = 30, end = 800)))
r_all_fasta_sub <- r_all_fasta %>%
mutate(sequence = case_when(str_detect(label, "psba1_A04") ~ str_sub(sequence, start = 30, end = 450),
str_detect(label, "psba1_A06") ~ str_sub(sequence, start = 30, end = 400),
str_detect(label, "psba1_A07") ~ str_sub(sequence, start = 30, end = 500),
str_detect(label, "psba1_A08") ~ str_sub(sequence, start = 30, end = 450),
str_detect(label, "psba1_A08") ~ str_sub(sequence, start = 30, end = 500),
str_detect(label, "psba1_A09") ~ str_sub(sequence, start = 30, end = 500),
str_detect(label, "psba1_A12") ~ str_sub(sequence, start = 30, end = 550),
str_detect(label, "psba1_B02") ~ str_sub(sequence, start = 30, end = 300),
str_detect(label, "psba1_B03") ~ str_sub(sequence, start = 30, end = 500),
str_detect(label, "psba1_B06") ~ str_sub(sequence, start = 30, end = 500),
str_detect(label, "psba1_B09") ~ str_sub(sequence, start = 30, end = 450),
str_detect(label, "psba1_B10") ~ str_sub(sequence, start = 30, end = 450),
str_detect(label, "psba1_B11") ~ str_sub(sequence, start = 30, end = 450),
str_detect(label, "psba1_C04") ~ str_sub(sequence, start = 30, end = 225),
str_detect(label, "psba1_D05") ~ str_sub(sequence, start = 30, end = 300),
str_detect(label, "psba1_D11") ~ str_sub(sequence, start = 30, end = 425),
str_detect(label, "psba1_E03") ~ str_sub(sequence, start = 30, end = 550),
str_detect(label, "psba1_F02") ~ str_sub(sequence, start = 30, end = 500),
str_detect(label, "psba1_F08") ~ str_sub(sequence, start = 30, end = 500),
str_detect(label, "psba1_G01") ~ str_sub(sequence, start = 30, end = 300),
str_detect(label, "psba1_H01") ~ str_sub(sequence, start = 30, end = 550),
str_detect(label, "psba1_H02") ~ str_sub(sequence, start = 30, end = 500),
str_detect(label, "psba1_H10") ~ str_sub(sequence, start = 30, end = 450),
TRUE ~ str_sub(sequence, start = 30, end = 600)))
# convert to format for DECIPHER and reverse complement the reverse sequence
f_dss <- f_all_fasta_sub %>%
deframe() %>%
as_dna() %>%
dna_to_DNAStringset()
r_dss <- r_all_fasta_sub %>%
deframe() %>%
as_dna() %>%
dna_to_DNAStringset() %>%
reverseComplement()
pair_list <- list()
for(i in 1:length(f_dss)){
ss <- DNAStringSet(c(f_dss[i], r_dss[i]))
pair_list[[i]] <- ss
}
# Align the sample pairs
alignment_list <- list()
for(i in 1:length(pair_list)){
alignment <- AlignSeqs(pair_list[[i]], verbose = FALSE)
alignment_list[[i]] <- alignment
}
# View the alignments
aligned_df <- data.frame()
for(i in 1:length(alignment_list)){
a_pair <- alignment_list[[i]] %>% writeXStringSet("temp_file.fasta")
a_df <- read_fasta_df("temp_file.fasta")
aligned_df <- rbind(aligned_df, a_df)
}
aligned_plotting <- aligned_df %>%
mutate(sample_id = str_sub(label, 23, 25)) %>% # create a metadata column that identifies the sample
mutate(label = str_sub(label, 23, 31))
# Create profile key
key <- aligned_plotting %>%
tibble::rownames_to_column(var = "id")
# Create long dataframe for ggplot
long_sequences <- str_split(aligned_plotting$sequence, "") %>%
reshape2::melt() %>%
group_by(L1) %>%
mutate(x = row_number(),
L1 = as.character(L1)) %>%
left_join(., key, by = c("L1" = "id")) %>%
ungroup()
# Plot alignment
ggplot(long_sequences, aes(y = label, x = x)) +
geom_tile(aes(fill = value), size = 1, name = "base") +
facet_wrap(~ sample_id, nrow = 12, scales = "free_y") +
scale_fill_manual(values = palette) +
theme(aspect.ratio = 0.3,
axis.title.y = element_blank()) +
scale_x_continuous(expand = c(0, 0)) +
xlab("Position")
# Sample list that we may want to exclude based on f and r alignments: B12, C04, G01, D06, D11
exclude_list <- c("B12", "D06", "C04", "F02", "B02", "B03")
align_list_final <- list()
for(i in 1:length(alignment_list)){
if(str_sub(names(alignment_list[[i]][1]), 23, 25) %in% exclude_list) align_list_final[[i]] <- NULL else align_list_final[[i]] <- alignment_list[[i]]
}
align_list_final <- Filter(Negate(is.null), align_list_final)
consensus_df <- list()
for(i in 1:length(align_list_final)){
a_con <- align_list_final[[i]] %>% ConsensusSequence()
names(a_con) <- names(align_list_final[[i]])[1]
consensus_df[[i]] <- a_con
}
pocillo_meta <- meta %>%
filter(mtORF == "Pmeandrina" | mtORF == "Pverrucosa" | mtORF == "Haplotype8a") %>%
filter(str_detect(psba_ID, "psb"))
pocillo_names <- pocillo_meta %>% pull(psba_ID)
pocillo_cons <- list()
for(i in 1:length(consensus_df)){
if(str_sub(names(consensus_df[[i]]), 17, 25) %in% pocillo_names) pocillo_cons[[i]] <- consensus_df[[i]] else pocillo_cons[[i]] <- NULL
}
# Keep only the pocillo psba sequences
pocillo_cons <- Filter(Negate(is.null), pocillo_cons)
# Get consensus seqs
pocillo_df <- data.frame()
for(i in 1:length(pocillo_cons)){
pocillo_cons[[i]] %>% writeXStringSet("temp_file.fasta")
t_df <- read_fasta_df("temp_file.fasta")
pocillo_df <- rbind(pocillo_df, t_df)
}
# Kmer based distance matrix
dis_psbaID <- pocillo_df %>%
mutate(psba_ID = str_sub(label, 17, 25)) %>%
left_join(., pocillo_meta) %>%
select(sample_name, sequence) %>%
deframe() %>%
DNAStringSet() %>%
DNAStringSet_to_DNAbin() %>%
kmer::kdistance(k = 7) %>%
as.matrix()
# Produce hierarchical tree
psba_upgma_tree <- phangorn::upgma(dis_psbaID)
# Subset the full matrix
du_poci_sub <- dist_subset(du_poci, psba_upgma_tree$tip.label) %>%
as.matrix()
# Produce hierarchical tree
ITS2_upgma_tree <- phangorn::upgma(du_poci_sub)
untang <- untangle(dend1 = ITS2_upgma_tree %>% as.dendrogram(), dend2 = psba_upgma_tree %>% as.dendrogram, method = "step2side")
tgram <- tanglegram(untang, margin_inner = 8)
entanglement(tgram) # 0.01193893
## [1] 0.01193893
t_order <- labels(tgram$dend1)
s_order <- labels(tgram$dend2)
tt <- as.phylo(tgram$dend1)
tt <- midpoint(tt)
st <- as.phylo(tgram$dend2)
st <- midpoint(st)
tree_meta <- pocillo_meta %>%
filter(sample_name %in% psba_upgma_tree$tip.label) %>%
mutate(new_lab = paste0(sample_name, "_", mtORF))
gtt <- ggtree(tt, ladderize = FALSE)
gtt$data <- gtt$data %>% left_join(., tree_meta, by = c("label" = "sample_name"))
gtt <- gtt + geom_tiplab(size = 3, aes(label = label)) + geom_tippoint(aes(fill = mtORF), shape = 21, size = 5) + ggsci::scale_fill_d3(palette = "category20")
gst <- ggtree(st, ladderize = FALSE)
dtt <- gtt$data
dst <- gst$data
dtt$tree <- "ITS2"
dst$tree <- "psbA"
dst$x <- max(dst$x) - dst$x + max(dtt$x) + max(dtt$x) * 1
pp <- gtt + geom_tree(data = dst)
dd <- bind_rows(dtt, dst) %>%
filter(isTip == TRUE)
dd1 <- as.data.frame(dd)
p_ip <- pp + geom_line(aes(x, y, group = label), size = 1.2, data = dd1) +
ggsci::scale_colour_d3(palette = "category20") +
theme(legend.position = "none")
p_ip
acro_meta <- meta %>%
filter(mtORF == "Ahumilis") %>%
filter(str_detect(psba_ID, "psb"))
acro_names <- acro_meta %>% pull(psba_ID)
acro_cons <- list()
for(i in 1:length(consensus_df)){
if(str_sub(names(consensus_df[[i]]), 17, 25) %in% acro_names) acro_cons[[i]] <- consensus_df[[i]] else acro_cons[[i]] <- NULL
}
# Keep only the pocillo psba sequences
acro_cons <- Filter(Negate(is.null), acro_cons)
# Get consensus seqs
acro_df <- data.frame()
for(i in 1:length(acro_cons)){
acro_cons[[i]] %>% writeXStringSet("temp_file.fasta")
t_df <- read_fasta_df("temp_file.fasta")
acro_df <- rbind(acro_df, t_df)
}
# Kmer based distance matrix
dis_psbaID <- acro_df %>%
mutate(psba_ID = str_sub(label, 17, 25)) %>%
left_join(., acro_meta) %>%
select(sample_name, sequence) %>%
deframe() %>%
DNAStringSet() %>%
DNAStringSet_to_DNAbin() %>%
kmer::kdistance(k = 7) %>%
as.matrix()
# Produce hierarchical tree
psba_upgma_tree <- phangorn::upgma(dis_psbaID)
# Subset the full matrix
du_acro_sub <- dist_subset(du_acro, psba_upgma_tree$tip.label) %>%
as.matrix()
# Produce hierarchical tree
ITS2_upgma_tree <- phangorn::upgma(du_acro_sub)
untang <- untangle(dend1 = ITS2_upgma_tree %>% as.dendrogram(), dend2 = psba_upgma_tree %>% as.dendrogram, method = "step2side")
tgram <- tanglegram(untang, margin_inner = 8)
entanglement(tgram) # 0.1986148
## [1] 0.1986148
t_order <- labels(tgram$dend1)
s_order <- labels(tgram$dend2)
tt <- as.phylo(tgram$dend1)
tt <- midpoint(tt)
st <- as.phylo(tgram$dend2)
st <- midpoint(st)
tree_meta <- acro_meta %>%
filter(sample_name %in% psba_upgma_tree$tip.label) %>%
mutate(new_lab = paste0(sample_name, "_", mtORF))
gtt <- ggtree(tt, ladderize = FALSE)
gtt$data <- gtt$data %>% left_join(., tree_meta, by = c("label" = "sample_name"))
gtt <- gtt + geom_tiplab(size = 3, aes(label = label))
gst <- ggtree(st, ladderize = FALSE)
dtt <- gtt$data
dst <- gst$data
dtt$tree <- "ITS2"
dst$tree <- "psbA"
dst$x <- max(dst$x) - dst$x + max(dtt$x) + max(dtt$x) * 1
pp <- gtt + geom_tree(data = dst)
dd <- bind_rows(dtt, dst) %>%
filter(isTip == TRUE)
dd1 <- as.data.frame(dd)
p_ip <- pp + geom_line(aes(x, y, group = label), size = 1.2, data = dd1)
tgram <- tanglegram(untang, margin_inner = 8)
p_ip
Here we are running a PCoA on UniFrac distances for the three communities associated with three different host groups (Pocillopora verrucosa, P. meandrina, and Acropora humilis) We are looking for patterns by reef here
reef_order <- meta %>% distinct(Reef, Lat) %>%
group_by(Reef) %>%
summarise(Lat = mean(Lat)) %>%
arrange(Lat) %>%
pull(Reef)
pcoa_pver <- cmdscale(dist_pver, eig = TRUE) #this is doing 'cmds' or classic multidimensional scaling
ordiplot(pcoa_pver, display = 'sites', type = 'text')
barplot(pcoa_pver$eig, names = paste ('PCoA', 1:152), las = 3, ylab = 'eigenvalues')
MDSxy.pver <- data.frame(pcoa_pver$points) %>%
rownames_to_column(var = "sample_name") %>%
left_join(., meta)
pverPCA <- MDSxy.pver %>%
mutate(Reef = fct_relevel(Reef, reef_order)) %>%
ggplot(aes(X1, X2, fill = Reef)) +
geom_point(alpha = 1, shape = 21, size = 3) +
scale_fill_viridis_d(option = "magma", direction = -1)
pverPCA
pcoa_pmh8 <- cmdscale(dist_pmh8, eig = TRUE) #this is doing 'cmds' or classic multidimensional scaling
ordiplot(pcoa_pmh8, display = 'sites', type = 'text')
barplot(pcoa_pmh8$eig, names = paste ('PCoA', 1:172), las = 3, ylab = 'eigenvalues')
MDSxy.pmh8 <- data.frame(pcoa_pmh8$points) %>%
rownames_to_column(var = "sample_name") %>%
left_join(., meta)
pmh8PCA <- MDSxy.pmh8 %>%
mutate(Reef = fct_relevel(Reef, reef_order)) %>%
ggplot(aes(X1, X2, fill = Reef)) +
geom_point(alpha = 1, shape = 21, size = 3) +
scale_fill_viridis_d(option = "magma", direction = -1)
pmh8PCA
pcoa_acro <- cmdscale(dist_acro, eig = TRUE) #this is doing 'cmds' or classic multidimensional scaling
ordiplot(pcoa_acro, display = 'sites', type = 'text')
barplot (pcoa_acro$eig, names = paste ('PCoA', 1:260), las = 3, ylab = 'eigenvalues')
MDSxy.acro <- data.frame(pcoa_acro$points) %>%
rownames_to_column(var = "sample_name") %>%
left_join(., meta)
acroPCA <- MDSxy.acro %>%
mutate(Reef = fct_relevel(Reef, reef_order)) %>%
ggplot(aes(X1, X2, fill = Reef)) +
geom_point(alpha = 1, shape = 21, size = 3) +
#ggsci::scale_fill_d3(palette = "category20") #+
scale_fill_viridis_d(option = "magma", direction = -1)
# scale_fill_manual(values = reef_pal) #+
#theme(legend.position = "none", aspect.ratio = 1, text = element_text(size = 15))
acroPCA
pverPCA + pmh8PCA + acroPCA
Fig SX. PCoA of each species symbiont communities
#importing unifrac distances for analysis
meta_pver <- meta %>%
filter(mtORF == "Pverrucosa") %>%
filter(sample_name %in% keepers_ss$sample_name) %>%
mutate(catBleaching = as.numeric(catBleaching)) %>%
tibble::column_to_rownames(var = "sample_name") %>%
mutate(sample_name = rownames(.))
dist_pver_ordered <- dist_subset(dist_pver, rownames(meta_pver))
# Make a correlation matrix of numeric variables
cm <- cor(meta_pver %>% select(Depth, maxDHW, meanDHW, recent.maxDHW,
recent.meanDHW, DHW3, DHW4, DHW6, DHW8,
DHW9, rangeSST, varSST, MMM, Lat))
corrplot(cm) # Clearly a lot of the DHW terms are positively related and approaching redundant
cm %>% cor()
## Depth maxDHW meanDHW recent.maxDHW recent.meanDHW
## Depth 1.00000000 -0.3747247 -0.1815539 -0.29551269 0.1602795
## maxDHW -0.37472467 1.0000000 0.5774847 0.97359709 0.1838974
## meanDHW -0.18155391 0.5774847 1.0000000 0.65767097 0.7855995
## recent.maxDHW -0.29551269 0.9735971 0.6576710 1.00000000 0.3590438
## recent.meanDHW 0.16027947 0.1838974 0.7855995 0.35904376 1.0000000
## DHW3 -0.24926416 0.5463224 0.8962680 0.59328239 0.5323811
## DHW4 -0.36324104 0.8192436 0.8026414 0.80591540 0.3158608
## DHW6 -0.02146289 0.4397345 0.9134538 0.55363040 0.8476837
## DHW8 -0.09437930 0.3798900 0.9489017 0.49383156 0.9000039
## DHW9 -0.07846512 0.1613627 0.8756751 0.26558333 0.8749837
## rangeSST 0.01304759 -0.5078297 -0.8707069 -0.64716282 -0.9279753
## varSST -0.04530179 -0.4159139 -0.8598618 -0.56570304 -0.9602305
## MMM 0.39542721 -0.4244711 0.2864056 -0.24602109 0.7857945
## Lat -0.30686699 0.2075915 -0.5125898 0.02496643 -0.9109919
## DHW3 DHW4 DHW6 DHW8 DHW9
## Depth -0.24926416 -0.36324104 -0.02146289 -0.0943793 -0.07846512
## maxDHW 0.54632238 0.81924362 0.43973446 0.3798900 0.16136266
## meanDHW 0.89626804 0.80264144 0.91345377 0.9489017 0.87567507
## recent.maxDHW 0.59328239 0.80591540 0.55363040 0.4938316 0.26558333
## recent.meanDHW 0.53238105 0.31586079 0.84768366 0.9000039 0.87498366
## DHW3 1.00000000 0.85552516 0.73344231 0.7551756 0.68912001
## DHW4 0.85552516 1.00000000 0.69712721 0.6244913 0.45419667
## DHW6 0.73344231 0.69712721 1.00000000 0.9476503 0.84778489
## DHW8 0.75517560 0.62449133 0.94765025 1.0000000 0.94959759
## DHW9 0.68912001 0.45419667 0.84778489 0.9495976 1.00000000
## rangeSST -0.61740960 -0.56154282 -0.90049909 -0.9211093 -0.80516316
## varSST -0.59834596 -0.50642870 -0.91139180 -0.9342055 -0.83994732
## MMM 0.03286723 -0.31572098 0.41847542 0.5158353 0.63053196
## Lat -0.25297629 0.07931285 -0.60053164 -0.7026860 -0.77933963
## rangeSST varSST MMM Lat
## Depth 0.01304759 -0.04530179 0.39542721 -0.30686699
## maxDHW -0.50782966 -0.41591386 -0.42447110 0.20759147
## meanDHW -0.87070694 -0.85986184 0.28640564 -0.51258976
## recent.maxDHW -0.64716282 -0.56570304 -0.24602109 0.02496643
## recent.meanDHW -0.92797525 -0.96023048 0.78579452 -0.91099192
## DHW3 -0.61740960 -0.59834596 0.03286723 -0.25297629
## DHW4 -0.56154282 -0.50642870 -0.31572098 0.07931285
## DHW6 -0.90049909 -0.91139180 0.41847542 -0.60053164
## DHW8 -0.92110935 -0.93420546 0.51583531 -0.70268602
## DHW9 -0.80516316 -0.83994732 0.63053196 -0.77933963
## rangeSST 1.00000000 0.99373739 -0.52353597 0.70620277
## varSST 0.99373739 1.00000000 -0.60278844 0.77070312
## MMM -0.52353597 -0.60278844 1.00000000 -0.96777268
## Lat 0.70620277 0.77070312 -0.96777268 1.00000000
# Check the vif scores of the full model
ord_pver_full <- dbrda(dist_pver_ordered ~ Depth + maxDHW + meanDHW + recent.maxDHW +
recent.meanDHW + DHW3 + DHW4 + DHW6 + DHW8 +
DHW9 + rangeSST + varSST + MMM + Lat + catBleaching, data = meta_pver)
sort(vif.cca(ord_pver_full))
## catBleaching Depth DHW6 DHW3 recent.maxDHW
## 1.111768 1.515812 8.142266 9.271732 10.991559
## DHW8 maxDHW DHW9 DHW4 meanDHW
## 11.098885 15.964914 21.303216 30.863670 58.933962
## rangeSST varSST Lat recent.meanDHW MMM
## 65.268575 134.814585 188.594615 190.309030 419.108422
#many correlated thermal history variables. We can definitely keep: depth, catBleaching.
#keep lat, remove MMM -> one variable to reflect latitudinal gardient
#keep varSST, remove rangeSST -> one variable to reflect variation throughout the year
#keep recent.maxDHW -> one variable to reflect recent thermal history (would expect to shape more than long-term)
# Reduce the model (consult the corplot and vif scores)
ord_pver <- dbrda(dist_pver_ordered ~ Lat + Depth + catBleaching + varSST + recent.maxDHW, data = meta_pver)
# Re check the new vif scores
sort(vif.cca(ord_pver))
## catBleaching Depth recent.maxDHW Lat varSST
## 1.023056 1.044675 1.480152 1.993197 2.225752
# Use ordistep to further refine the model
os_pver_backward <- ordistep(ord_pver, direction = "backward", permutations = 999)
##
## Start: dist_pver_ordered ~ Lat + Depth + catBleaching + varSST + recent.maxDHW
##
## Df AIC F Pr(>F)
## - catBleaching 1 223.80 0.9427 0.459
## - varSST 1 223.87 1.0134 0.383
## - Lat 1 224.03 1.1628 0.293
## - recent.maxDHW 1 224.45 1.5702 0.125
## - Depth 1 225.09 2.1917 0.028 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: dist_pver_ordered ~ Lat + Depth + varSST + recent.maxDHW
##
## Df AIC F Pr(>F)
## - varSST 1 222.80 0.9729 0.404
## - Lat 1 223.16 1.3210 0.202
## - recent.maxDHW 1 223.32 1.4789 0.156
## - Depth 1 224.06 2.2064 0.035 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: dist_pver_ordered ~ Lat + Depth + recent.maxDHW
##
## Df AIC F Pr(>F)
## - recent.maxDHW 1 222.45 1.6085 0.110
## - Depth 1 223.06 2.2115 0.031 *
## - Lat 1 223.62 2.7648 0.010 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: dist_pver_ordered ~ Lat + Depth
##
## Df AIC F Pr(>F)
## - Depth 1 222.76 2.2809 0.028 *
## - Lat 1 223.62 3.1452 0.009 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(os_pver_backward, by = 'margin')
## Permutation test for dbrda under reduced model
## Marginal effects of terms
## Permutation: free
## Number of permutations: 999
##
## Model: dbrda(formula = dist_pver_ordered ~ Lat + Depth, data = meta_pver)
## Df SumOfSqs F Pr(>F)
## Lat 1 0.0883 3.1452 0.012 *
## Depth 1 0.0640 2.2809 0.025 *
## Residual 149 4.1812
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Permutation test for dbrda under NA model
# Marginal effects of terms
# Permutation: free
# Number of permutations: 999
#
# Model: dbrda(formula = dist_pver ~ Lat, data = meta_pver)
# Df SumOfSqs F Pr(>F)
# Lat 1 0.0494 1.7237 0.086 .
# Residual 150 4.3017
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#variance partitioning to see how much variation is explained by each factor
varp <- varpart(dist_pver_ordered, ~Lat, ~Depth, ~recent.maxDHW, data = meta_pver)
plot(varp, digits = 2, Xnames = c("Lat", "Depth", "recent.maxDHW"), bg = c("navy", "tomato", "orange"))
#extract dbRDA scores
pver_scores <- as.data.frame(scores(os_pver_backward, display = "sites")) %>%
tibble::rownames_to_column(var = "sample_name") %>%
left_join(., meta_pver)
#extract dbRDA vectors
pver_vectors <- as.data.frame(os_pver_backward$CCA$biplot) %>%
tibble::rownames_to_column(var = "factors")
#produce dbRDA plot
pver_fullrda <- pver_scores %>%
mutate(Reef = fct_relevel(Reef, reef_order)) %>%
ggplot(aes(x = dbRDA1, y = dbRDA2)) +
geom_point(aes(fill = Lat), size = 4, shape = 21) +
geom_label_repel(data = pver_vectors, aes(x = dbRDA1, y = dbRDA2, label = factors), box.padding = 0.75, size = 3, segment.colour = NA) +
geom_segment(data = pver_vectors, aes(x = 0, xend = dbRDA1, y = 0, yend = dbRDA2),
size = 0.5, arrow = arrow(length = unit(0.25, "cm")), colour = "grey") +
theme(legend.position = "right", aspect.ratio = 1, text = element_text(size = 15)) +
scale_fill_viridis_c(option = "magma", direction = -1) +
theme(legend.position = "none", aspect.ratio = 1, text = element_text(size = 15))
pver_fullrda
#importing unifrac distances for analysis
meta_pmh8 <- meta %>%
filter(mtORF == "Pmeandrina" | mtORF == "Haplotype8a") %>%
filter(sample_name %in% keepers_ss$sample_name) %>%
mutate(catBleaching = as.numeric(catBleaching)) %>%
tibble::column_to_rownames(var = "sample_name") %>%
mutate(sample_name = rownames(.))
dist_pmh8_ordered <- dist_subset(dist_pmh8, rownames(meta_pmh8))
# Make a correlation matrix of numeric variables
cm <- cor(meta_pmh8 %>% select(Depth, maxDHW, meanDHW, recent.maxDHW,
recent.meanDHW, DHW3, DHW4, DHW6, DHW8,
DHW9, rangeSST, varSST, MMM, Lat))
corrplot(cm) # Clearly a lot of the DHW terms are positively related and approaching redundant
# Check the vif scores of the full model
ord_pmh8_full <- dbrda(dist_pmh8_ordered ~ Depth + maxDHW + meanDHW + recent.maxDHW +
recent.meanDHW + DHW3 + DHW4 + DHW6 + DHW8 +
DHW9 + rangeSST + varSST + MMM + Lat + catBleaching, data = meta_pmh8)
sort(vif.cca(ord_pmh8_full))
## catBleaching Depth DHW4 DHW6 DHW8
## 1.188304 1.817157 5.924381 6.117623 10.000429
## DHW9 maxDHW DHW3 recent.maxDHW meanDHW
## 10.132665 11.382971 12.028073 12.650027 50.092986
## rangeSST recent.meanDHW varSST Lat MMM
## 63.799068 115.231738 152.920843 200.478794 383.137823
#insert vif scores here
# Reduce the model (consult the corplot and vif scores)
ord_pmh8 <- dbrda(dist_pmh8_ordered ~ Lat + Depth + catBleaching + varSST + recent.maxDHW, data = meta_pmh8)
# Re check the new vif scores
sort(vif.cca(ord_pmh8))
## catBleaching Depth recent.maxDHW varSST Lat
## 1.121912 1.222941 1.335107 3.509340 4.322619
#insert vif scores here
# Use ordistep to further refine the model
ord_pmh8_backward <- ordistep(ord_pmh8, direction = "backward", permutations = 999)
##
## Start: dist_pmh8_ordered ~ Lat + Depth + catBleaching + varSST + recent.maxDHW
##
## Df AIC F Pr(>F)
## - recent.maxDHW 1 258.30 0.7066 0.568
## - Depth 1 259.28 1.6597 0.156
## - catBleaching 1 263.93 6.2589 0.002 **
## - varSST 1 264.88 7.2119 0.001 ***
## - Lat 1 278.21 21.1719 0.001 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: dist_pmh8_ordered ~ Lat + Depth + catBleaching + varSST
##
## Df AIC F Pr(>F)
## - Depth 1 258.44 2.0895 0.075 .
## - catBleaching 1 262.67 6.3031 0.003 **
## - varSST 1 263.94 7.5903 0.001 ***
## - Lat 1 281.85 26.7498 0.001 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(ord_pmh8_backward, by = 'margin')
## Permutation test for dbrda under reduced model
## Marginal effects of terms
## Permutation: free
## Number of permutations: 999
##
## Model: dbrda(formula = dist_pmh8_ordered ~ Lat + Depth + catBleaching + varSST, data = meta_pmh8)
## Df SumOfSqs F Pr(>F)
## Lat 1 0.6825 26.7498 0.001 ***
## Depth 1 0.0533 2.0895 0.070 .
## catBleaching 1 0.1608 6.3031 0.002 **
## varSST 1 0.1936 7.5903 0.001 ***
## Residual 167 4.2607
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#insert model outputs here
#variance partitioning: how much does each variable explain in the model? residuals?
varp <- varpart(dist_pmh8_ordered, ~Lat, ~catBleaching, ~varSST, data = meta_pmh8)
plot(varp, digits = 2, Xnames = c("Lat", "catBleaching", "varSST"), bg = c("navy", "tomato", "orange"))
#extract scores from dbRDA
pmh8_scores <- as.data.frame(scores(ord_pmh8_backward, display = "sites")) %>%
tibble::rownames_to_column(var = "sample_name") %>%
left_join(., meta_pmh8)
#extract vectors from dbRDA
pmh8_vectors <- as.data.frame(ord_pmh8_backward$CCA$biplot) %>%
tibble::rownames_to_column(var = "factors")
#produce dbRDA plot
pmh8_fullrda <- pmh8_scores %>%
mutate(Reef = fct_relevel(Reef, reef_order)) %>%
ggplot(aes(x = dbRDA1, y = dbRDA2)) +
geom_point(aes(fill = Lat), size = 4, shape = 21) +
geom_label_repel(data = pmh8_vectors, aes(x = dbRDA1, y = dbRDA2, label = factors), box.padding = 0.5, size = 3, segment.colour = NA) +
geom_segment(data = pmh8_vectors, aes(x = 0, xend = dbRDA1, y = 0, yend = dbRDA2),
size = 0.5, arrow = arrow(length = unit(0.25, "cm")), colour = "grey") +
theme(aspect.ratio = 1, text = element_text(size = 15)) +
guides(fill = guide_colorbar (reverse = T)) +
scale_fill_viridis_c(option = "magma", direction = -1) +
theme(legend.position = "none", aspect.ratio = 1, text = element_text(size = 15))
pmh8_fullrda
library(ggrepel)
#identify 7 outlier samples and remove from data set
outlier_samples <- c("Plate6_D012", "Plate6_E009", "Plate6_H011", "Plate6_B012", "Plate7_D006", "Plate6_G009", "Plate7_H009", "Plate7_A010", "Plate6_D011")
#importing unifrac distances for analysis
meta_acro <- meta %>%
filter(mtORF == "Ahumilis") %>%
filter(sample_name %in% keepers_ss$sample_name) %>%
filter(!(sample_name %in% outlier_samples)) %>%
mutate(catBleaching = as.factor(catBleaching)) %>%
tibble::column_to_rownames(var = "sample_name") %>%
mutate(sample_name = rownames(.))
dist_acro_ordered <- dist_subset(dist_acro, rownames(meta_acro))
# Make a correlation matrix of numeric variables
cm <- cor(meta_acro %>% select(Depth, maxDHW, meanDHW, recent.maxDHW,
recent.meanDHW, DHW3, DHW4, DHW6, DHW8,
DHW9, rangeSST, varSST, MMM, Lat))
corrplot(cm) # Clearly a lot of the DHW terms are positively related and approaching redundant
# Check the vif scores of the full model
ord_acro_full <- dbrda(dist_acro_ordered ~ Depth + maxDHW + meanDHW + recent.maxDHW +
recent.meanDHW + DHW3 + DHW4 + DHW6 + DHW8 +
DHW9 + rangeSST + varSST + MMM + Lat + catBleaching, data = meta_acro)
sort(vif.cca(ord_acro_full))
## catBleaching6 Depth catBleaching2 catBleaching3 DHW6
## 1.435074 1.616944 2.773975 4.290302 5.117525
## catBleaching4 catBleaching5 DHW8 DHW3 DHW4
## 5.440155 5.573908 6.273625 7.617778 8.934821
## maxDHW DHW9 recent.maxDHW meanDHW recent.meanDHW
## 15.324583 15.526488 16.970046 33.230620 107.130974
## rangeSST varSST Lat MMM
## 164.878761 187.998043 795.502964 864.078437
#insert vif scores here
# Reduce the model (consult the corplot and vif scores)
ord_acro <- dbrda(dist_acro_ordered ~ Lat + Depth + catBleaching + varSST + recent.maxDHW, data = meta_acro)
# Re check the new vif scores
sort(vif.cca(ord_acro))
## Depth catBleaching6 recent.maxDHW varSST catBleaching2
## 1.178178 1.295819 1.728728 1.857405 2.494004
## Lat catBleaching3 catBleaching5 catBleaching4
## 2.835694 3.412909 4.107573 4.268968
#insert vif scores here
# Use ordistep to further refine the model
ord_acro_backward <- ordistep(ord_acro, direction = "backward", permutations = 999)
##
## Start: dist_acro_ordered ~ Lat + Depth + catBleaching + varSST + recent.maxDHW
##
## Df AIC F Pr(>F)
## - catBleaching 5 569.62 1.2385 0.212
## - Depth 1 573.41 2.0894 0.063 .
## - varSST 1 575.34 3.9582 0.008 **
## - Lat 1 578.53 7.0967 0.001 ***
## - recent.maxDHW 1 591.62 20.3815 0.001 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: dist_acro_ordered ~ Lat + Depth + varSST + recent.maxDHW
##
## Df AIC F Pr(>F)
## - Depth 1 569.80 2.1485 0.073 .
## - varSST 1 571.56 3.8977 0.007 **
## - Lat 1 575.42 7.7651 0.001 ***
## - recent.maxDHW 1 587.07 19.8207 0.001 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(ord_acro_backward, by = 'margin')
## Permutation test for dbrda under reduced model
## Marginal effects of terms
## Permutation: free
## Number of permutations: 999
##
## Model: dbrda(formula = dist_acro_ordered ~ Lat + Depth + varSST + recent.maxDHW, data = meta_acro)
## Df SumOfSqs F Pr(>F)
## Lat 1 0.2946 7.7651 0.001 ***
## Depth 1 0.0815 2.1485 0.079 .
## varSST 1 0.1479 3.8977 0.007 **
## recent.maxDHW 1 0.7520 19.8207 0.001 ***
## Residual 246 9.3328
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#insert model outputs here
#variance partitioning: how much does each variable explain in the model? residuals?
varp <- varpart(dist_acro_ordered, ~Lat, ~varSST, ~recent.maxDHW, data = meta_acro)
plot(varp, digits = 2, Xnames = c("Lat", "varSST", "recent.maxDHW"), bg = c("navy", "tomato", "orange"))
#extract scores from dbRDA
acro_scores <- as.data.frame(scores(ord_acro_backward, display = "sites")) %>%
tibble::rownames_to_column(var = "sample_name") %>%
left_join(., meta_acro)
#extract vectors from dbRDA
acro_vectors <- as.data.frame(ord_acro_backward$CCA$biplot) %>%
tibble::rownames_to_column(var = "factors")
#produce dbRDA plot
acro_fullrda <- acro_scores %>%
mutate(Reef = fct_relevel(Reef, reef_order)) %>%
ggplot(aes(x = dbRDA1, y = dbRDA2)) +
geom_point(aes(fill = Lat), size = 4, shape = 21) +
geom_label_repel(data = acro_vectors, aes(x = dbRDA1, y = dbRDA2, label = factors), box.padding = 1, size = 3, segment.colour = NA) +
geom_segment(data = acro_vectors, aes(x = 0, xend = dbRDA1, y = 0, yend = dbRDA2), size = 0.5, arrow = arrow(length = unit(0.25, "cm")), colour = "grey") +
theme(aspect.ratio = 1, text = element_text(size = 15)) +
guides(fill = guide_colorbar (reverse = T)) +
scale_fill_viridis_c(option = "magma", direction = -1)
#scale_fill_d3(palette = "category20")
acro_fullrda
dbrda <- pver_fullrda + pmh8_fullrda + acro_fullrda
dbrda
library(ggraph)
library(igraph)
library(tidyverse)
library(tidygraph)
library(assertthat)
library(purrr)
library(ggplot2)
library(dplyr)
#list of sequences per reef.
pver_seqreef <- pver_seqs %>%
distinct(name, Reef)
#removing sequences that are abundant across 5 or more reefs, as these sequences will not be 'diagnostic' if present across many reefs
remove_list <- pver_seqreef %>%
filter(!str_detect(name, "A|G")) %>%
dplyr::count(name) %>%
filter(n >= 11) %>%
mutate(name = as.character(name)) %>%
pull(name)
#within each reef, get the sum of sequences across all samples collected.
pver_reefseqs = pver_seqs %>%
filter(!(name %in% remove_list)) %>%
select(name, Reef, sample_name, value ) %>%
group_by(Reef, name) %>%
summarise(value = sum(value)) %>%
select(from = name, to = Reef, value)
pver_network <- pver_reefseqs %>%
as_tbl_graph() %>% # convert to tidygraph format
activate(nodes) %>%
mutate(attribute = case_when(str_detect(name, "Bougainville|Chilcott|Flinders|Frederick|Herald|Holmes|Lihou|Marion|Moore|Osprey|Saumarez|Willis|Wreck") ~ "Reef",
TRUE ~ "Sequence"), # add node attribute info
text_label = case_when(attribute == "Reef" ~ name, TRUE ~ ""),
centrality = centrality_degree()) %>% # can measure which nodes/reefs are 'central'
ggraph(layout = "fr") +
geom_edge_link(aes(width = value, alpha = value)) +
scale_edge_width(range = c(0.2, 2.5)) + # control size
geom_node_point(aes(shape = attribute, fill = centrality), size = 4) +
geom_node_text(aes(label = text_label), repel = TRUE) +
scale_shape_manual(values = c(21, 22)) +
scale_fill_viridis_c(option = "magma")
pver_network
#results <- pver_network$data
#list of sequences per reef.
pmh8_seqreef <- pmh8_seqs %>%
distinct(name, Reef)
#removing sequences that are abundant across 5 or more reefs, as these sequences will not be 'diagnostic' if present across many reefs
remove_list <- pmh8_seqreef %>%
filter(!str_detect(name, "A|G")) %>%
dplyr::count(name) %>%
filter(n >= 12) %>%
mutate(name = as.character(name)) %>%
pull(name)
#within each reef, get the sum of sequences across all samples collected.
pmh8_reefseqs = pmh8_seqs %>%
filter(!(name %in% remove_list)) %>%
select(name, Reef, sample_name, value ) %>%
group_by(Reef, name) %>%
summarise(value = sum(value)) %>%
select(from = name, to = Reef, value)
pmh8_network <- pmh8_reefseqs %>%
as_tbl_graph() %>% # convert to tidygraph format
activate(nodes) %>%
mutate(attribute = case_when(str_detect(name, "Bougainville|Chilcott|Flinders|Frederick|Herald|Holmes|Lihou|Marion|Moore|Osprey|Saumarez|Willis|Wreck") ~ "Reef",
TRUE ~ "Sequence"), # add node attribute info
text_label = case_when(attribute == "Reef" ~ name, TRUE ~ ""),
centrality = centrality_degree()) %>% # can measure which nodes/reefs are 'central'
ggraph(layout = "fr") +
geom_edge_link(aes(width = value, alpha = value)) +
scale_edge_width(range = c(0.2, 2.5)) + # control size
geom_node_point(aes(shape = attribute, fill = centrality), size = 4) +
geom_node_text(aes(label = text_label), repel = TRUE) +
scale_shape_manual(values = c(21, 22)) +
scale_fill_viridis_c(option = "magma")
pmh8_network
#list of sequences per reef.
acro_seqreef <- acro_seqs %>%
distinct(name, Reef)
#removing sequences that are abundant across 5 or more reefs, as these sequences will not be 'diagnostic' if present across many reefs
remove_list <- acro_seqreef %>%
filter(!str_detect(name, "A|G")) %>%
dplyr::count(name) %>%
filter(n >= 12) %>%
mutate(name = as.character(name)) %>%
pull(name)
#within each reef, get the sum of sequences across all samples collected.
acro_reefseqs = acro_seqs %>%
filter(!(name %in% remove_list)) %>%
select(name, Reef, sample_name, value ) %>%
group_by(Reef, name) %>%
summarise(value = sum(value)) %>%
select(from = name, to = Reef, value)
acro_network <- acro_reefseqs %>%
as_tbl_graph() %>% # convert to tidygraph format
activate(nodes) %>%
mutate(attribute = case_when(str_detect(name, "Bougainville|Chilcott|Flinders|Frederick|Herald|Holmes|Lihou|Marion|Moore|Osprey|Saumarez|Willis|Wreck") ~ "Reef",
TRUE ~ "Sequence"), # add node attribute info
text_label = case_when(attribute == "Reef" ~ name, TRUE ~ ""),
centrality = centrality_degree()) %>% # can measure which nodes/reefs are 'central'
ggraph(layout = "fr") +
geom_edge_link(aes(width = value, alpha = value)) +
scale_edge_width(range = c(0.2, 2.5)) + # control size
geom_node_point(aes(shape = attribute, fill = centrality), size = 4) +
geom_node_text(aes(label = text_label), repel = TRUE) +
scale_shape_manual(values = c(21, 22)) +
scale_fill_viridis_c(option = "magma")
acro_network
pver_network + pmh8_network + acro_network